About the data:

Series ID: GSE21933 Platform: Phalanx Human OneAray v5 GPL6254

Experiment type: Expression profiling by Array Details:

Phalanx Biotech Group’s Human OneArray v5 contains 32,048 features, 30968 detection probes and 1080 control probes, spotted onto glass slides using a proprietary non-contact printing method. Detection probes are annotated against the human genome and grouped into the following categories:

Lo FY, Chang JW, Chang IS, Chen YJ et al. The database of chromosome imbalance regions and genes resided in lung cancer from Asian and Caucasian identified by array-comparative genomic hybridization. BMC Cancer 2012 Jun 12;12:235. PMID: 22691236

Data and Normalization

data_id <- "GSE21933"

eset <-getGEO(data_id) #get dataset
## Found 1 file(s)
## GSE21933_series_matrix.txt.gz
gse <- eset[[1]] #get gene dataset

# head(pData(gse)) #phenotype data
# head(fData(gse)) #feature data
# head(exprs(gse)) #complete expression data set

summary(exprs(gse)) #statistical analysis of all summary()
##    GSM545615         GSM545616         GSM545617          GSM545618      
##  Min.   : 0.1536   Min.   : 0.4077   Min.   : 0.04938   Min.   : 0.5312  
##  1st Qu.: 3.8381   1st Qu.: 3.8453   1st Qu.: 3.84864   1st Qu.: 3.8799  
##  Median : 5.8976   Median : 5.8950   Median : 5.89495   Median : 5.8941  
##  Mean   : 6.2761   Mean   : 6.2757   Mean   : 6.27665   Mean   : 6.2752  
##  3rd Qu.: 8.4292   3rd Qu.: 8.4269   3rd Qu.: 8.42835   3rd Qu.: 8.4281  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.99895   Max.   :15.9989  
##    GSM545619        GSM545620         GSM545621         GSM545622       
##  Min.   : 0.283   Min.   : 0.3436   Min.   : 0.1726   Min.   : 0.01868  
##  1st Qu.: 3.819   1st Qu.: 3.8353   1st Qu.: 3.8568   1st Qu.: 3.84881  
##  Median : 5.894   Median : 5.8965   Median : 5.8949   Median : 5.89586  
##  Mean   : 6.277   Mean   : 6.2760   Mean   : 6.2766   Mean   : 6.27655  
##  3rd Qu.: 8.430   3rd Qu.: 8.4286   3rd Qu.: 8.4292   3rd Qu.: 8.42895  
##  Max.   :15.999   Max.   :15.9989   Max.   :15.9989   Max.   :15.99893  
##    GSM545623         GSM545624         GSM545625           GSM545626     
##  Min.   : 0.2089   Min.   : 0.2229   Min.   : 0.008928   Min.   : 0.039  
##  1st Qu.: 3.8365   1st Qu.: 3.8332   1st Qu.: 3.852787   1st Qu.: 3.845  
##  Median : 5.8907   Median : 5.8982   Median : 5.895908   Median : 5.896  
##  Mean   : 6.2757   Mean   : 6.2759   Mean   : 6.275721   Mean   : 6.276  
##  3rd Qu.: 8.4298   3rd Qu.: 8.4313   3rd Qu.: 8.429130   3rd Qu.: 8.428  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.998947   Max.   :15.999  
##    GSM545627         GSM545628          GSM545629         GSM545630      
##  Min.   : 0.0598   Min.   : 0.01137   Min.   : 0.2113   Min.   : 0.1679  
##  1st Qu.: 3.8114   1st Qu.: 3.82077   1st Qu.: 3.8270   1st Qu.: 3.8655  
##  Median : 5.9008   Median : 5.89057   Median : 5.8864   Median : 5.8968  
##  Mean   : 6.2769   Mean   : 6.27676   Mean   : 6.2747   Mean   : 6.2761  
##  3rd Qu.: 8.4302   3rd Qu.: 8.43115   3rd Qu.: 8.4314   3rd Qu.: 8.4304  
##  Max.   :15.9989   Max.   :15.99893   Max.   :15.9989   Max.   :15.9989  
##    GSM545631          GSM545632        GSM545633          GSM545634        
##  Min.   : 0.02635   Min.   : 0.260   Min.   : 0.05111   Min.   : 0.009472  
##  1st Qu.: 3.83963   1st Qu.: 3.867   1st Qu.: 3.85254   1st Qu.: 3.840950  
##  Median : 5.89913   Median : 5.900   Median : 5.89552   Median : 5.894485  
##  Mean   : 6.27604   Mean   : 6.275   Mean   : 6.27769   Mean   : 6.277425  
##  3rd Qu.: 8.42879   3rd Qu.: 8.428   3rd Qu.: 8.43166   3rd Qu.: 8.430225  
##  Max.   :15.99893   Max.   :15.999   Max.   :15.99890   Max.   :15.998947  
##    GSM545635        GSM545636        GSM545637         GSM545638      
##  Min.   : 0.000   Min.   : 0.523   Min.   : 0.3539   Min.   : 0.3566  
##  1st Qu.: 3.844   1st Qu.: 3.834   1st Qu.: 3.8428   1st Qu.: 3.8392  
##  Median : 5.894   Median : 5.889   Median : 5.9023   Median : 5.8975  
##  Mean   : 6.277   Mean   : 6.273   Mean   : 6.2740   Mean   : 6.2766  
##  3rd Qu.: 8.429   3rd Qu.: 8.427   3rd Qu.: 8.4290   3rd Qu.: 8.4297  
##  Max.   :15.999   Max.   :15.999   Max.   :15.9989   Max.   :15.9989  
##    GSM545639         GSM545640          GSM545641         GSM545642       
##  Min.   : 0.2665   Min.   : 0.08381   Min.   : 0.4029   Min.   : 0.05743  
##  1st Qu.: 3.8352   1st Qu.: 3.87108   1st Qu.: 3.8324   1st Qu.: 3.83935  
##  Median : 5.8931   Median : 5.89739   Median : 5.8930   Median : 5.89279  
##  Mean   : 6.2754   Mean   : 6.27513   Mean   : 6.2761   Mean   : 6.27641  
##  3rd Qu.: 8.4288   3rd Qu.: 8.43237   3rd Qu.: 8.4306   3rd Qu.: 8.43054  
##  Max.   :15.9989   Max.   :15.99895   Max.   :15.9989   Max.   :15.99893  
##    GSM545643         GSM545644         GSM545645          GSM545646      
##  Min.   : 0.3071   Min.   : 0.4294   Min.   : 0.03132   Min.   : 0.3196  
##  1st Qu.: 3.8600   1st Qu.: 3.8814   1st Qu.: 3.84646   1st Qu.: 3.8684  
##  Median : 5.8883   Median : 5.8968   Median : 5.89639   Median : 5.9005  
##  Mean   : 6.2759   Mean   : 6.2753   Mean   : 6.27705   Mean   : 6.2748  
##  3rd Qu.: 8.4299   3rd Qu.: 8.4290   3rd Qu.: 8.42751   3rd Qu.: 8.4301  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.99895   Max.   :15.9989  
##    GSM545647         GSM545648           GSM545649         GSM545650      
##  Min.   : 0.1531   Min.   : 0.008351   Min.   : 0.1819   Min.   : 0.3828  
##  1st Qu.: 3.8076   1st Qu.: 3.839472   1st Qu.: 3.8272   1st Qu.: 3.8240  
##  Median : 5.8911   Median : 5.896806   Median : 5.8958   Median : 5.8992  
##  Mean   : 6.2754   Mean   : 6.277289   Mean   : 6.2767   Mean   : 6.2751  
##  3rd Qu.: 8.4308   3rd Qu.: 8.430428   3rd Qu.: 8.4304   3rd Qu.: 8.4293  
##  Max.   :15.9989   Max.   :15.998947   Max.   :15.9989   Max.   :15.9989  
##    GSM545651         GSM545652          GSM545653         GSM545654       
##  Min.   : 0.1291   Min.   : 0.07773   Min.   : 0.3255   Min.   : 0.01947  
##  1st Qu.: 3.8631   1st Qu.: 3.84617   1st Qu.: 3.8466   1st Qu.: 3.85324  
##  Median : 5.8989   Median : 5.89233   Median : 5.8969   Median : 5.89586  
##  Mean   : 6.2770   Mean   : 6.27683   Mean   : 6.2754   Mean   : 6.27742  
##  3rd Qu.: 8.4321   3rd Qu.: 8.43029   3rd Qu.: 8.4304   3rd Qu.: 8.43093  
##  Max.   :15.9989   Max.   :15.99895   Max.   :15.9989   Max.   :15.99893  
##    GSM545655           GSM545656       
##  Min.   : 0.005867   Min.   : 0.04854  
##  1st Qu.: 3.838326   1st Qu.: 3.83994  
##  Median : 5.897306   Median : 5.89339  
##  Mean   : 6.276177   Mean   : 6.27702  
##  3rd Qu.: 8.429420   3rd Qu.: 8.42844  
##  Max.   :15.998928   Max.   :15.99893
boxplot(exprs(gse), outline = FALSE, col = "gold") #Visualize the normalization

The Phenotype Data contains 42 samples and 40 features for the samples. The Feature Data: The feature data contains 30967 probes and 10 features for each samples.

Looking at the summary of expressions, the values for each sample falls between 0-16. Suggesting that these values a log2 normalized. Further, the figure below shows a boxplot of all the gene count values for each sample. As seen below, all the samples have lined up unifromly suggesting that the data is already log normalized.

Inspecting clinical Data

There are several columns within the metadata many of these are repeating. The most important of these columns are the column-1 - title, and columns 36 to 40 age:ch1,histology:ch1,sex:ch1,stage:ch1,tissue:ch1. Further, most of the column names have a “:ch1” at the end. This was removed. Lastly, the histology and stage columns have no data for negative samples. This was changed to “neg”.

#Preparing sample metadata
samplesinfo <- pData(gse) #gettign sample data
samplesinfo <- samplesinfo[,c(1,36:40)] #There are multiple columns for same data, acquiring relevant metadata information.
colnames(samplesinfo) <- gsub(":ch1","",colnames(samplesinfo))

samplesinfo <- samplesinfo%>%
  mutate(across(c(3,5),~replace_na(.,"Neg")))

samplesinfo <- samplesinfo%>%
  mutate_all(~gsub(" years","",.))%>%
  mutate(across(c(histology,sex,stage,tissue),factor))%>%
  mutate(age = as.numeric(age))%>%
  mutate(tissue = ifelse(tissue == "primary normal lung tissue","normal","tumor"))

samplesinfo$histology <- relevel(samplesinfo$histology, ref = "Neg")
samplesinfo$stage <- relevel(samplesinfo$stage, ref = "Neg")
samplesinfo$tissue <- relevel(as.factor(samplesinfo$tissue), ref = "normal")

#Prepare Features Metadata 
featuresinfo <- fData(gse)
#Prepare expression data
exprs_data <- exprs(gse)

samplesinfo%>%
  group_by(tissue, stage,histology)%>%
  tally()%>%
  spread(histology,n)

The data set represents 42 lung tissue samples, 21 primary normal lung tissues and 21 primary lung tumor tissues. The tumor tissues are representing six stages of lung cancer IA,IB,IIB,IIIA,IIIB,IV. Further two hematological patterns are also represented in tumor samples, these are: adenocarcinoma (AD) and Squamous cell carcinoma (SQ).

Clustering and PCA analysis

corMatrix <- cor(exprs_data, use = "c")
rownames(samplesinfo) <- colnames(corMatrix)
pheatmap(corMatrix, annotation_col =samplesinfo[,3:6], annotation_row = samplesinfo[,3:6], cluster_rows = T, cluster_cols = T)

#Hierarchical cluster based dimension analysis
htree <- hclust(dist(t(exprs_data)), method = "average")
plot(htree) #It seems there is a specific pattern of clustering within samples. And this might be biologically relevant difference so I think it is better to makes sure we check with PCA,the variance cotnribution. 

pca_gse <- prcomp(t(exprs_data)) # Run PCA analysis

screeplot(pca_gse, npcs=min(10,length(pca_gse$sdev)),type = c("barplot","lines")) #Screeplots of all the PC;s and their cotnribution. 

# PCA plot by tissue
p1 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=tissue, label = paste(tissue)))+
  geom_point(size=5)+
  geom_text_repel()
  

p2 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=stage, label = paste(stage)))+
  geom_point()+
  geom_text_repel()

p3 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=histology, label = paste(histology)))+
  geom_point()+
  geom_text_repel()

p4 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=title, label = paste(title)))+
  geom_point(size = 5)+
  geom_text_repel(size = 5)+
  ggsci::scale_fill_nejm()


ggarrange(p1,p2,p3,p4, ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom")

Filter dataset:

First I am filtering out only genes that belong to group-6 in the array as that represents the control genes. Lowly expressed genes must be filtered out as they can deviate the results of gene expression. The cut off for low expression is median gene expression of each sample. Next, I am applying three seperate strategies: 1) All samples have a data 2) At least 50% samples have all the data 3) At least 25% of sample have a data. However, this is a microarray dataset.

Exploring cut offs:

control_list <- rownames(gse@featureData[which(gse@featureData@data$GROUP ==6)]) #create a vector list of controls
control_matrix <- exprs_data[rownames(exprs_data)%in% control_list,] #isolate a matrix of just controls

mean_control <- colMeans(control_matrix)
mean_control <- data.frame(mean_control)%>%
  rownames_to_column(var = "control")
colnames(mean_control) <- c("controls","mean")

cat(paste0("Mean of Controls= ",mean(control_matrix),"\n","Mean of data= ",mean(exprs_data),"\n", "Median of Controls= ",median(control_matrix),"\n","Median of data= ",median(exprs_data)))
## Mean of Controls= 8.31688252769473
## Mean of data= 6.27604254788615
## Median of Controls= 7.96752265
## Median of data= 5.894992
cutoff <- median(exprs_data) #take median of expression dataset
is_expressed <- exprs_data>cutoff

keep_all<- rowSums(is_expressed) >=42 #stringent should be present in all samples.
keep_50 <- rowSums(is_expressed)>=21 #should be present in at leasst 50% samples.
keep <- rowSums(is_expressed)<10 # lenient threshold present in at least 25% samples

gse_filt <- rbind(table(keep),table(keep_50),table(keep_all))
rownames(gse_filt) <- c("25%","50%","100%")
gse_filt
##      FALSE  TRUE
## 25%  17877 13090
## 50%  15596 15371
## 100% 22070  8897
gse_25 <- gse[keep,]
gse_50 <- gse[keep_50,]
gse_100 <- gse[keep_all,]

dim_table <- data.frame(SampleCount = c("25%","50%","100%"),
                        features = c(dim(gse_25)[1],dim(gse_50)[1],dim(gse_100)[1]),
                        features = c(dim(gse_25)[2],dim(gse_50)[2],dim(gse_100)[2]))

print(dim_table)
##   SampleCount features features.1
## 1         25%    13090         42
## 2         50%    15371         42
## 3        100%     8897         42
plot(density(control_matrix), main = "Expression Density", xlab = "log2 intensity",)
abline(v = c(1,6), col = c("red","blue"), lty = 2)  # typical cutoff

Filter genes:

#Filter only sampels that are group-1 or known genes 
exon_genes <- row.names(featuresinfo[which(featuresinfo$GROUP == 1),])
gse[exon_genes,]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 19677 features, 42 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: GSM545615 GSM545616 ... GSM545656 (42 total)
##   varLabels: title geo_accession ... tissue:ch1 (40 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: PH_hs_0000002 PH_hs_0000004 ... PH_hs_0041207 (19677
##     total)
##   fvarLabels: ID GROUP ... ORF (10 total)
##   fvarMetadata: Column Description labelDescription
## experimentData: use 'experimentData(object)'
##   pubMedIds: 22691236 
## Annotation: GPL6254
exprs_data <- exprs_data[exon_genes,]

universe_file <- featuresinfo[which(row.names(featuresinfo)%in%exon_genes),]$GENE_SYMBOL
write(universe_file, file = "../outputs/universe_WGCNA.txt")

Differential Expression

Combined effect and interaction effect

set.seed(1234)
design_combined <- model.matrix(~0+tissue+ histology+stage, data = samplesinfo)
design_interaction <- model.matrix(~tissue*histology*stage, data = samplesinfo)

AD versus SQ versus neg

design_gse <- model.matrix(~0+samplesinfo$histology)
colnames(design_gse)<-c("neg","AD","SQ")

fit <- lmFit(exprs(gse_100),design_gse) #fit all the genes
contrasts_gse <- makeContrasts( ADvsneg = AD-neg, #AD versus negative
                                SQvsneg = SQ-neg, #SQ versus negative
                                ADvsSQ = AD-SQ, #AD versus SQ
                                SQvsAD = SQ-AD, #SQ versus AD
                                avsavg = (AD+SQ)/2-neg, #negative versus combined disease how significantly different SQ and AD are from negative.
                                ADvsSQtoneg = ((AD-neg)-(SQ-neg)), #Checks if AD is significantly more different than SQ
                                levels =design_gse)
cont_names <- colnames(contrasts_gse)

result_all <- data.frame()
for (i in cont_names){
  fit_contrast <- contrasts.fit(fit, contrasts_gse[,i])
  fit_contrast <- eBayes(fit_contrast)
  results <- topTable(fit_contrast, adjust="fdr", number=Inf)
  results$Genes <- rownames(results)
  results$Contrasts<- i
  result_all <- bind_rows(result_all,results)
  
  plot <-ggplot(results, aes(x = logFC, y = -log10(adj.P.Val))) +
    geom_point(aes(color = adj.P.Val < 0.05), alpha = 0.5) +
    scale_color_manual(values = c("black", "red")) +
    labs(title = paste("Volcano Plot:", i), x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
    theme_minimal()
  print(plot)
  ggsave(filename = paste0(i,"_volcano.png"), path = "../figures")
  
}

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image
#volcano plot
ggplot(result_all, aes(x = logFC, y = -log10(adj.P.Val), color = Contrasts)) +
  geom_point(alpha = 0.5) +
  labs(title = "Volcano Plot for Multiple Contrasts", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
  theme_minimal()

ggsave(filename = "volcano_all_contrasts.png",path = "../figures")
## Saving 7 x 5 in image

WGCNA

Chose Threshold

I chose the threshold to be at 15 as this had a mean connectivity of at scale free topology index of

softPower <- 15

cat("The soft power is",softPower)
## The soft power is 15
adj_matrix <- t(exprs_data) #filtered genes present in all samples
adjacency_mat <- adjacency(adj_matrix, power = softPower)

# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency_mat)
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM <- 1 - TOM

# Hierarchical clustering
geneTree <- hclust(as.dist(dissTOM), method = "average")
plot(geneTree, main = "Gene clustering")

# Module identification using dynamic tree cut deepsplit allows adjusting granulatity. 
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit = 2, pamRespectsDendro = FALSE,
                             minClusterSize = 100) #minClustersize seems optimal with at least 100 genes per cluster. 
##  ..cutHeight not given, setting it to 0.999  ===>  99% of the (truncated) height range in dendro.
##  ..done.
cat("DYnamic tree with modules:")
## DYnamic tree with modules:
table(dynamicMods)
## dynamicMods
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 6965 1974 1621 1580 1263 1199  878  826  767  601  594  518  430  337  124
# Assign module colors
dynamicColors <- labels2colors(dynamicMods)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

cat("DYnamic tree modules with colors:")
## DYnamic tree modules with colors:
table(dynamicColors)
## dynamicColors
##       black        blue       brown        cyan       green greenyellow 
##         826        1621        1580         124        1199         518 
##        grey     magenta        pink      purple         red      salmon 
##        6965         601         767         594         878         337 
##         tan   turquoise      yellow 
##         430        1974        1263
# STEP-4: Relate Modules to Traits
# Calculate eigengenes
MEs <- moduleEigengenes(adj_matrix, colors = dynamicColors)$eigengenes
rowname_MEs <- row.names(MEs)
MEs <- orderMEs(MEs)
row.names(MEs) <- rowname_MEs

# Correlate eigengenes with traits
traits <- samplesinfo %>%
  mutate(across(where(is.factor), ~as.numeric(as.factor(.)))) %>%
  select(-c("title","sex"))  # Remove non-trait columns like ID if present

#traits <- model.matrix(~ histology + sex + stage + tissue + age, data = samplesinfo)[,-1]


moduleTraitCor <- cor(MEs,traits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples = ncol(adj_matrix))

all(rownames(MEs) == rownames(traits))  # must be TRUE
## [1] TRUE
# Plot correlation heatmap

# Load required libraries
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Prepare correlation and p-value matrices
moduleTraitCor <- cor(MEs, traits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples = ncol(adj_matrix))

# Melt into long format for ggplot
df_cor <- melt(moduleTraitCor)
df_pval <- melt(moduleTraitPvalue)

# Combine and format labels
df_combined <- df_cor
df_combined$Pvalue <- df_pval$value
df_combined$Label <- paste0(signif(df_cor$value, 2), "\n(",
                            signif(df_pval$value, 1), ")")

# Optional: set factor levels to preserve order
df_combined$Var1 <- factor(df_combined$Var1, levels = rownames(moduleTraitCor))
df_combined$Var2 <- factor(df_combined$Var2, levels = colnames(moduleTraitCor))

# Plot
ggplot(df_combined, aes(x = Var2, y = Var1, fill = value)) +
  geom_tile(color = "grey90") +
  geom_text(aes(label = Label), size = 3.5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0,
                       name = "Correlation") +
  theme_minimal(base_size = 14) +
  labs(x = "Traits", y = "Modules", title = "Module-Trait Relationships") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid = element_blank(),
        plot.title = element_text(hjust = 0.5))

ggsave(filename = "WGCNA_ModuleCorr.png",path = "../figures")
## Saving 12 x 9 in image
names(dynamicColors) <- colnames(adj_matrix)
kWithin <- intramodularConnectivity(adjacency_mat, dynamicColors) # Calculate intra-modular connectivity (hubness)

library(readr)

# Function to check if a string is a single valid entry
is_single_valid <- function(x) {
  !is.na(x) & x != "" & !grepl(";", x)
}

for (m in unique(dynamicColors)) {
  module_genes <- names(dynamicColors)[which(dynamicColors == m)]
  
  if (length(module_genes) == 0) next
  
  # Order genes by intramodular connectivity
  hub_genes <- module_genes[order(kWithin[module_genes, "kWithin"], decreasing = TRUE)]
  
  # Extract columns
  gene_symbol <- featuresinfo[hub_genes, "GENE_SYMBOL"]
  ensembl_id <- featuresinfo[hub_genes, "ENSEMBL_GENE_ID"]
  
  # Determine final name using the priority rules
  final_names <- character(length(hub_genes))
  for (i in seq_along(hub_genes)) {
    if (is_single_valid(gene_symbol[i])) {
      final_names[i] <- gene_symbol[i]
    } else if (is_single_valid(ensembl_id[i])) {
      final_names[i] <- ensembl_id[i]
    } else {
      final_names[i] <- hub_genes[i]  # fallback to original name
    }
  }
  
  # Write final names to output
  write_lines(final_names, file = paste0("../outputs/hub_", m, ".txt"))
}

tutorial:

https://sbc.shef.ac.uk/geo_tutorial/tutorial.nb.html